##Exploratory Data Analysis- Section 4
###Analytical Graphics Principle 1: show comparisons, and don’t forget your control Principle 2: Show causality, mechanism, explanation for differences Principle 3: show multi-variate data Principle 4: Show what you need, and don’t be limited by the tools Principle 5: describe and document your evidence (code, labels, etc…) Principle 6: content is king- the story you tell is most important
##Exploratory Graphs Why? Look at trends, debug an analysis, communicate results, suggest modelling strategies. - made quickly, many are made at once, axes are cleaned later, and colors will change.
## par() changes the global graphics parameters for all graphics. Requires a reset by using the code below:
old.par <- par(mfrow = c(1,1), mfcol = c(1,1), mar = c(5, 4, 4, 2))
par(old.par) ##This code allows you to reset the parameters of a plot in R.
## The list below are all par() commands.
# ls() is the orientation of axis labels on a plot
# bg() is the background color
# mar() defines the margin size
# oma() is the outer margin size
# mfrow() is numbers of plots per row, column, filled row-wise.
# mfcol() is numbers of plots per row. column, filled column-wise.
boxplot(mtcars$mpg, col= "blue") ##example of a boxplot
abline(h = max(mtcars$mpg), col= "orange", lwd = 2) ##h here is horizontal line addition
abline(h = mean(mtcars$mpg), col ="green", lwd = 2) ## lwd is the line width/thickness
##_______________________
boxplot(mpg ~ cyl, data = mtcars, col = "orange") ##plotting the relationship " ~ " between to variables, where mpy is y axis, and cyl is x axis.
abline(h = mean(mtcars$mpg), col = "blue", lwd = 2)
##_______________________
mtcars2 <- transform(mtcars, cyl= factor(cyl))
boxplot(disp ~ cyl, data = mtcars2, xlab = "Cylinders", ylab = "Displacement") ##labeling axes in base R.
##_______________________
hist(mtcars$mpg, col= "orange", breaks = 20, main = "Miles per Gallon accross cars") ##example of a histogram
rug(mtcars$mpg)
abline(v = mean(mtcars$mpg), col ="purple", lwd = 3) ##v stands for vertical line.
abline(v = max(mtcars$mpg), col = "yellow", lwd = 2)
##_______________________
barplot(table(mtcars$gear), col= "turquoise", main = "Number of Gears per Car")
##_______________________
mtcars <- na.omit(mtcars)
with(mtcars, plot(mpg, disp, col=mpg))
with(subset(mtcars, disp > 400), points(mpg, disp, col = "black")) ##xy scatter plot comparing two column variables. we are taking a subset of the mtcars where displacement is greater than 400, and making those points black.
##_______________________
##Adding more details to standard plot, e.g. using the legend expression, and adding regression.
with(mtcars, plot(mpg, disp, main = "Association Between MPG and Displacement", type = "n"))
with(subset(mtcars, cyl == 4), points(mpg, disp, col = "green"))
with(subset(mtcars, cyl != 4), points(mpg, disp, col = "red"))
legend("topright", pch = 1, col = c("green", "red"), legend = c("4 Cylinders", "More than 4 cylinders")) #pch changes the characteristic of the circle/point in the plot.
model <- lm(disp ~ mpg, mtcars) ##creating a liner model between two varialbes, mpg and displacement
abline(model, col = "black", lty = 2)
#abline(h = mean(mtcars$weight), col="orange", lty = 2) ##lty is a dotted line.
#abline(v = mean(mtcars$mpg), col="yellow", lty=2)
##_______________________
par(mfrow = c(3,1), mar = c(5,4,2,1)) ##more to learn here ##starting at bottom and moving clockwise is how margins are oriented. For mfrow, it states one row, three column.
mtcars$gear <- as.integer(mtcars$gear)
with(subset(mtcars, gear == 3), plot(mpg, wt, col = wt, main = "Three Gears"))
with(subset(mtcars, gear == 4), plot(mpg, wt, col = wt, main = "Four Gears"))
with(subset(mtcars, gear == 5), plot(mpg, wt, col = wt, main = "Five Gears"))
##_______________________
old.par <- par(mfrow = c(1,1), mfcol = c(1,1), mar = c(5, 4, 4, 2))
par(old.par) ##resetting
par(mfrow = c(1,3), oma = c(0,0,2,0)) ##gives us a little room on the outer margin for a main title/header.
with(airquality, {
plot(Wind, Ozone, main = "Ozone vs. Wind")
plot(Solar.R, Ozone, main = "Ozone vs. Solar Radiation")
plot(Temp, Ozone, main = "Ozone and Temperature")
mtext("Ozone vs. Weather in NYC", outer = TRUE) ##adds a main title/header to this series of three plots.
})
##_______________________
##Standard plot systems
old.par <- par(mfrow = c(1,1), mfcol = c(1,1), mar = c(5, 4, 4, 2))
par(old.par) ##This code allows you to reset the parameters of a plot in R.
with(cars, plot(speed, dist))
##_______________________
##Standard lattice system
#e.g. xyplot and bwplot
library(lattice)
xyplot(mpg ~ disp | gear, data = mtcars, layout = c(4,1)) ##separating x/y plot by gear
##_______________________
##ggplot2 plotting
library(ggplot2)
data(mtcars)
qplot(disp, mpg, data = mtcars)
Once the plot has been generated… dev.copy(png, file = “filename.png”) dev.off()
##code for histogram of power data, UC Irvine Machine Learning Repository
data_table <- read.table("./data/household_power_consumption.txt", header = TRUE, sep = ";", na.strings = "?") ##separated by semi colons, and the na's are marked as "?".
data_table$Date_Time <- paste(data_table$Date, data_table$Time) ##Paste the first two rows together and renames them as Date_Time
data_table$Global_active_power <- as.numeric(data_table$Global_active_power) ##changes needed data to numeric.
data_table$Date_Time <- strptime(data_table$Date_Time, "%d/%m/%Y %H:%M:%S") #defining time string as time.
data_table <- data_table[data_table$Date %in% c("1/2/2007", "2/2/2007"), ] ##subsets the data for the two dates in question.
head(data_table, n=20) ##quick gut check
hist(data_table$Global_active_power, col= "red", breaks = 14, main = "Global Active Power"
, xlab = "Global Active Power (kilowatts)")
dev.copy(png, file = "plot1_R.png")
## quartz_off_screen
## 3
dev.off()
## quartz_off_screen
## 2
##code for line plot of power data, UC Irvine Machine Learning Repository
data_table <- read.table("./data/household_power_consumption.txt", header = TRUE, sep = ";", na.strings = "?") ##separated by semi colons, and the na's are marked as "?".
data_table$Date_Time <- paste(data_table$Date, data_table$Time) ##Paste the first two rows together and renames them as Date_Time
data_table$Global_active_power <- as.numeric(data_table$Global_active_power) ##changes needed data to numeric.
data_table$Date_Time <- strptime(data_table$Date_Time, "%d/%m/%Y %H:%M:%S") #defining time string as time.
data_table <- data_table[data_table$Date %in% c("1/2/2007", "2/2/2007"), ] ##subsets the data for the two dates in question.
head(data_table, n=20) ##quick gut check
with(data_table, plot(Date_Time, Global_active_power, xlab = "", ylab = "Global Active Power (kilowatts)", type ="l", lwd = 2)) ##defining the plot type as line or "l", and width of 2.
dev.copy(png, file = "plot2_R.png")
## quartz_off_screen
## 3
dev.off()
## quartz_off_screen
## 2
###Plotting + Lines (multi-column) Example, Power Grid
##code for three-variable plot of metering data, UC Irvine Machine Learning Repository
data_table <- read.table("./data/household_power_consumption.txt", header = TRUE, sep = ";", na.strings = "?") ##separated by semi colons, and the na's are marked as "?".
data_table$Date_Time <- paste(data_table$Date, data_table$Time) ##Paste the first two rows together and renames them as Date_Time
data_table$Global_active_power <- as.numeric(data_table$Global_active_power) ##changes needed data to numeric.
data_table$Date_Time <- strptime(data_table$Date_Time, "%d/%m/%Y %H:%M:%S") #defining time string as time.
data_table <- data_table[data_table$Date %in% c("1/2/2007", "2/2/2007"), ] ##subsets the data for the two dates in question.
head(data_table, n=20) ##quick gut check
with(data_table, plot(Date_Time, Sub_metering_1, col= "black", xlab = "", ylab ="Energy sub metering", type = "l"))
lines(data_table$Date_Time, data_table$Sub_metering_2, col="orange", type="l")
lines(data_table$Date_Time, data_table$Sub_metering_3, col="blue", type="l")
legend("topright", c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3"), col = c("black", "orange", "blue"), lty = 1)
dev.copy(png, file = "plot3_R.png")
## quartz_off_screen
## 3
dev.off()
## quartz_off_screen
## 2
###2x2 Plot Example, Power Grid
##code for 2x2 plot of metering data, UC Irvine Machine Learning Repository
data_table <- read.table("./data/household_power_consumption.txt", header = TRUE, sep = ";", na.strings = "?") ##separated by semi colons, and the na's are marked as "?".
data_table$Date_Time <- paste(data_table$Date, data_table$Time) ##Paste the first two rows together and renames them as Date_Time
data_table$Global_active_power <- as.numeric(data_table$Global_active_power) ##changes needed data to numeric.
data_table$Global_reactive_power <- as.numeric(data_table$Global_reactive_power)
data_table$Voltage <- as.numeric(data_table$Voltage)
data_table$Date_Time <- strptime(data_table$Date_Time, "%d/%m/%Y %H:%M:%S") #defining time string as time.
data_table <- data_table[data_table$Date %in% c("1/2/2007", "2/2/2007"), ] ##subsets the data for the two dates in question.
head(data_table, n=20) ##quick gut check
par(mfrow = c(2,2))
with(data_table, plot(Date_Time, Global_active_power, xlab = "", ylab = "Global Active Power (kilowatts)", type ="l", lwd = 2))
with(data_table, plot(Date_Time, Voltage, xlab = "", ylab = "Voltage", type ="l", lwd = 1))
with(data_table, plot(Date_Time, Sub_metering_1, col= "black", xlab = "", ylab ="Energy sub metering", type = "l"))
lines(data_table$Date_Time, data_table$Sub_metering_2, col="orange", type="l")
lines(data_table$Date_Time, data_table$Sub_metering_3, col="blue", type="l")
legend("topright", c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3"), col = c("black", "orange", "blue"))
with(data_table, plot(Date_Time, Global_reactive_power, xlab = "", type ="l", lwd = 1))
dev.copy(png, file = "plot4_R.png")
## quartz_off_screen
## 3
dev.off()
## quartz_off_screen
## 2
##Lattice Plotting xyplot( y~ x | f * g, data) is the standard notation for a lattice plot. left of ~ is trhe y-axis, and on the right is the x-axis f and g are conditioning variables where the * indicates an interaction between two variables
library(lattice)
xyplot(mpg ~ disp, data =mtcars) ##simple scatter plot example using mtcars.
xyplot(Ozone ~ Wind, data = airquality)
##_____________________
airquality <- transform(airquality, Month = factor(Month)) ##Month was turned into a factor that we will ultimately parse the data into.
xyplot(Ozone ~ Wind | Month, data = airquality, layout = c(5,1)) ##xyplot, where month was listed as a conditioning variable.
##ggplot2 plotting Grammar of graphics by Leland Wilkinson (ggplot2.org) aesthetics: size, shape, color geoms: points, lines ###qplot
library(ggplot2)
str(mpg) ##example dataset that comes with ggplot2
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
## $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
## $ model : chr [1:234] "a4" "a4" "a4" "a4" ...
## $ displ : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr [1:234] "f" "f" "f" "f" ...
## $ cty : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr [1:234] "p" "p" "p" "p" ...
## $ class : chr [1:234] "compact" "compact" "compact" "compact" ...
qplot(displ, hwy, data = mpg) ##most basic of "base" plots for ggplot2. displ = x-coordinate, hwy = y-coordinate.
qplot(displ, hwy, data = mpg, color = drv) ##adds a legend where colors are specific to variable drv (front, rear or 4w)
qplot(displ, hwy, data = mpg, geom = c("point", "smooth")) ## grey = 95% confidence interval, and "smooth" is simply the function for the line.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
qplot( hwy, data = mpg, fill = drv) ##when only one variable is defined, it will produce a histogram. fill is an alternative to color when using a histogpram.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##Facets_______________
qplot(displ, hwy, data = mpg, facets = .~drv) ## here, the facets function splits "." or everything by ~ the variable drv
qplot(hwy, data= mpg, facets = drv~., binwidth= 2, color = drv)## here, histogram where facet used to separate drv ~ by "." or everything
##Geoms______________________
qplot(hwy, data = mpg, geom="density", color = drv) ##example of geom "density" which is a smooth plot alternative to histogram, can accept y variable only, notx and y
qplot(displ, hwy, data = mpg, color = drv) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
qplot(displ, hwy, data = mpg, facets = .~drv, color = drv) + geom_smooth(method = "lm") ## using facets to separate out the regression by drv. lm for linear model.
## `geom_smooth()` using formula 'y ~ x'
qplot(displ, hwy, data = mpg, facets = .~ cyl, color = cyl, geom = c("point", "smooth"), method = "lm") ##again, point and smooth gives us the confidence interval, along with a line "lm" for linear model.
## Warning: Ignoring unknown parameters: method
## `geom_smooth()` using formula 'y ~ x'
##______________________
g <- ggplot(mpg, aes(hwy, cyl)) ##aes stands for the aesthetic elements, i.e. what and it's orientation.
summary(g)
## data: manufacturer, model, displ, year, cyl, trans, drv, cty, hwy, fl,
## class [234x11]
## mapping: x = ~hwy, y = ~cyl
## faceting: <ggproto object: Class FacetNull, Facet, gg>
## compute_layout: function
## draw_back: function
## draw_front: function
## draw_labels: function
## draw_panels: function
## finish_data: function
## init_scales: function
## map_data: function
## params: list
## setup_data: function
## setup_params: function
## shrink: TRUE
## train_scales: function
## vars: function
## super: <ggproto object: Class FacetNull, Facet, gg>
##print(g) will return an error because there is insufficient information to provide an output.
p <- g + geom_point() ##point layer added to g
print(p)
g + geom_point() + geom_smooth() ##smooths out and adds confidence bands.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
g + geom_point() + geom_smooth(method = "lm") ##adds a linear model best-fit line
## `geom_smooth()` using formula 'y ~ x'
##___________________
h <- ggplot(mpg, aes(displ, drv))
h + geom_point(color = "forestgreen", size = 4, alpha= 1/2) + ##color of this variable is a constant - green
#facet_grid(. ~ cyl) +
geom_smooth(method = "lm") +
xlab("Displacement") + ##labels the x axis
ylab("Drive_Type") +
ggtitle("Engine Attributes")
## `geom_smooth()` using formula 'y ~ x'
h <- ggplot(mpg, aes(displ, cty))
h + geom_point(aes(color = cyl), size = 4, alpha= 1/2) + ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that.
theme_classic(base_family = "Avenir", base_size = 14) + ##changes the base font for the entire plot
geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm") +
xlab("Displacement") + ##labels the x axis
ylab("City_MPG") +
ggtitle("Engine Attributes") +
ylim(-5, 30) ##sets a limit to the y axis, but in ggplot, that will subset the data and remove outlier points.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## + coor_cartesian(ylim = c(-5, 30)) This is a way of preventing sub-setting and outliers are included in the finished plot.
##facet_wrap( column/category ~ column/category2, nrow = y, ncol = x)
##Heirarchical Clustering [i.e. cluster analysis] - How do we define close? - Continuous = euclidian distance (straight line) a^2 + b^2 = c^2 - Continuous = correlation similarity - Binary = manhattan distance. |A1 - A2| + |B1 - B2| + …. |Z1 - Z2| absolute values - think walking city blocks. - How do we group things? - How to we visualize the groupings? - How do we interpret the grouping?
Taking an agglomerative approach (bottom up), find closes two things, put them into larger clusters, find next closest, etc… - Required a defined distance - A merging approach Produced: a tree showing how close things are to each other.
e.g. dist(mpg) gives you the pairwise differences between each variable in each column.
###Dendrogram - rudimentary
distxy <- dist(mpg$cty)
hClustering <- hclust(distxy)
plot(hClustering) ##this clustering does not necessarily demonsrate anything significant in terms of output or trends, but will at least produce a figure.
#clusterMatrix <- as.matrix(hClustering[ , 8:9])
#heatmap(clusterMatrix)
set.seed(1234)
par(mar = c(0,0,0,0))
x <- rnorm(12, mean= rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1,2,1), each = 4), sd = 0.2)
plot(x,y, col= "orange", pch= 5, cex = 2)
text(x + 0.08, y + 0.08, labels = as.character(1:12))
Data_thing <- data.frame( x=x, y=y)
data_dist <- dist(Data_thing)
thing_cluster <- hclust(data_dist)
plot(thing_cluster)
thing_matrix <- as.matrix(Data_thing) [sample(1:12), ]
heatmap(thing_matrix)
##k-means Clustering & Dimension reduction Partitions -fix a number of clusters - get “centroids” of each cluster - assign things to closest centroid - recalculate centroids and iterate Requires - A defined distance metric - a number of clusters - an initial guess as to cluster centroids Produces - Final estimate of cluster and assignments
###kmeans Clustering
##Generating random data here
set.seed(1234)
par(mar = c(0,0,0,0))
x <- rnorm(12, mean= rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1,2,1), each = 4), sd = 0.2)
plot(x,y, col= "orange", pch= 5, cex = 2)
text(x + 0.08, y + 0.08, labels = as.character(1:12))
##kmeans function/argument for clustering analyses
data_thing <- data.frame(x = x, y = y)
kmeansObject <- kmeans(data_thing, centers = 3) ##caling kmeans argument, three centers/groups
names(kmeansObject)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
par(mar = rep(0.2, 4)) #adjustes the margins
plot( x, y, col= kmeansObject$cluster, pch = 19, cex = 2) #plots the kmeans function, calling out the clusters
points(kmeansObject$centers, col = 1:3, pch = 3, lwd = 3) # plots the points that represent the cluster centers
Not much to report here, lol.
library(grDevices) ##takes blending of primary and other colors and blend them together.
# colorRamp (e.g. gray)
# colorRampPalette (e.g. heat.colors)
# colors()
color1 <- colorRamp(c("red", "blue"))
color1(0) ##will return a matrix of three values - Red Green Blue in that order, where it's all red.
## [,1] [,2] [,3]
## [1,] 255 0 0
color1(1) ##will return a matrix that is all blue.
## [,1] [,2] [,3]
## [1,] 0 0 255
color1(0.5) # will return a blend of both red and blue, but no green.
## [,1] [,2] [,3]
## [1,] 127.5 0 127.5
color1(seq(0,1, len = 10)) ## generated a sequence, length of 10, of a sequence between all red to all blue.
## [,1] [,2] [,3]
## [1,] 255.00000 0 0.00000
## [2,] 226.66667 0 28.33333
## [3,] 198.33333 0 56.66667
## [4,] 170.00000 0 85.00000
## [5,] 141.66667 0 113.33333
## [6,] 113.33333 0 141.66667
## [7,] 85.00000 0 170.00000
## [8,] 56.66667 0 198.33333
## [9,] 28.33333 0 226.66667
## [10,] 0.00000 0 255.00000
library(RColorBrewer)
cols <- brewer.pal(15, "RdYlGn") ##brewer.pal the only useful argument in this package. colorbrewer.org has the list of available color shchema.
## Warning in brewer.pal(15, "RdYlGn"): n too large, allowed maximum for palette RdYlGn is 11
## Returning the palette you asked for with that many colors
pal <-colorRampPalette(cols) ##rendering the colors into Ramp Pallete
image(volcano, col= pal(15)) ##asking the colors to be those rendered into pal, and there will be 10 different colors.
x <- rnorm(1000)
y <- rnorm(1000)
plot(x, y, col = rgb(.1,0,.3,.2), pch = 19) ##RGB looks at red, green, blue as numeric, and last is the alpha of 0.2
list.files("./data/UCI HAR Dataset/")
## character(0)
datapath <-file.path("/Users/payashome/Documents/FMDtRH/R Studio/R Tutorials/R_4_DataScience/data", "UCI HAR Dataset 2")
files <- list.files(datapath, recursive = TRUE) ##lists all files in the UCI folder
files
## [1] "activity_labels.txt"
## [2] "features_info.txt"
## [3] "features.txt"
## [4] "README.txt"
## [5] "test/Inertial Signals/body_acc_x_test.txt"
## [6] "test/Inertial Signals/body_acc_y_test.txt"
## [7] "test/Inertial Signals/body_acc_z_test.txt"
## [8] "test/Inertial Signals/body_gyro_x_test.txt"
## [9] "test/Inertial Signals/body_gyro_y_test.txt"
## [10] "test/Inertial Signals/body_gyro_z_test.txt"
## [11] "test/Inertial Signals/total_acc_x_test.txt"
## [12] "test/Inertial Signals/total_acc_y_test.txt"
## [13] "test/Inertial Signals/total_acc_z_test.txt"
## [14] "test/subject_test.txt"
## [15] "test/X_test.txt"
## [16] "test/y_test.txt"
## [17] "train/Inertial Signals/body_acc_x_train.txt"
## [18] "train/Inertial Signals/body_acc_y_train.txt"
## [19] "train/Inertial Signals/body_acc_z_train.txt"
## [20] "train/Inertial Signals/body_gyro_x_train.txt"
## [21] "train/Inertial Signals/body_gyro_y_train.txt"
## [22] "train/Inertial Signals/body_gyro_z_train.txt"
## [23] "train/Inertial Signals/total_acc_x_train.txt"
## [24] "train/Inertial Signals/total_acc_y_train.txt"
## [25] "train/Inertial Signals/total_acc_z_train.txt"
## [26] "train/subject_train.txt"
## [27] "train/X_train.txt"
## [28] "train/y_train.txt"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## We are going to read in the train, test, features and activities as seperate
#setwd("./data/UCI HAR Dataset 2")
x_train <- read.table(file.path(datapath, "train", "X_train.txt"), header = FALSE) ##here, file.path saves time by allowing the UCI HAR to be accesses as datapath, in place of pasting the complete filepath.
y_train <- read.table(file.path(datapath, "train", "Y_train.txt"), header = FALSE) ## reading in the y train data
train_sub <- read.table(file.path(datapath, "train", "subject_train.txt"), header = FALSE)
##_________________
x_test <- read.table(file.path(datapath, "test", "X_test.txt"), header = FALSE) ##reading in data for test, same as above.
y_test <- read.table(file.path(datapath, "test", "Y_test.txt"), header = FALSE)
test_sub <- read.table(file.path(datapath, "test", "subject_test.txt"), header = FALSE)
##_________________
features <- read.table(file.path(datapath, "features.txt"), header = FALSE) ##reading in additional files, no files denoted because they're already in datapath.
actLabel <- read.table(file.path(datapath, "activity_labels.txt"), header = FALSE)
colnames(actLabel) <- c('activityID', 'activityTYPE')
colnames(x_train) = features[ ,2] ##defining the column names as a function of features, which has two columns, so the column name for x_train will come from the list of names in column 2 of features.
colnames(y_train) = "activityID"
y_train <- left_join(y_train, actLabel, by = "activityID") ##Left joining two data.frames using actLabel as the key, where both column ID's have to match.
colnames(train_sub) = "subjectID"
colnames(x_test) = features[ ,2] ##column names are same as in train.
colnames(y_test) = "activityID" #descriptive column names for the activity type.
y_test <- left_join(y_test, actLabel, by ="activityID")
colnames(test_sub) = "subjectID" #descriptive column name for the individual, i.e. 1 of 30 participants.
##simply giving column names to the actLabel data.frame
combine_train <- cbind(y_train, train_sub, x_train) ##when str(combine_train), we see activityID as first column (y_train), then the participant ID, then the numerous data columns (x_train)
combine_test <- cbind(y_test, test_sub, x_test)
complete_data <- rbind(combine_train, combine_test) ##row binds the two datasets together.
names(complete_data) <- gsub("^t", "time", names(complete_data)) ##any string (i.e. column name) beginning with a t changed to time.
names(complete_data) <- gsub("^f", "frequency", names(complete_data)) ## same for frequency
names(complete_data) <- gsub("Acc", "Accelerometer", names(complete_data))
names(complete_data) <- gsub("Gyro", "Gyroscope", names(complete_data))
names(complete_data) <- gsub("Mag", "Magnitude", names(complete_data))
names(complete_data) <- gsub("BodyBody", "body", names(complete_data))
write.table(complete_data, file ="tidydata_samsung", row.name=FALSE)
head(complete_data, n=20)
##___________
par(mfrow = c(1,2), mar = c(5,4,2,1))
complete_data <- transform(complete_data, activityTYPE = factor(activityTYPE))
sub1 <- subset(complete_data, subjectID == 1)
plot(sub1[, 4], col= sub1$activityID, ylab = names(sub1)[4])
plot(sub1[, 5], col = sub1$activityID, ylab = names(sub1)[5])
op <- par(cex = .5) ##changes the font size for the legend of the plot.
legend("bottomright", legend = unique(sub1$activityTYPE), col = unique(sub1$activityTYPE), pch = 1)
par(mfrow = c(1,2), mar = c(5,4,2,1)) ##looking at the max acceleration in x and y for subject 1
complete_data <- transform(complete_data, activityTYPE = factor(activityTYPE))
sub1 <- subset(complete_data, subjectID == 1)
plot(sub1[, 13], col= sub1$activityID, ylab = names(sub1)[13])
plot(sub1[, 14], col = sub1$activityID, ylab = names(sub1)[14])
op <- par(cex = .5) ##changes the font size for the legend of the plot.
legend("topright", legend = unique(sub1$activityTYPE), col = unique(sub1$activityTYPE), pch = 19)
pm0 <- read.table("./data/PM2-5_2000_2019_annual.txt", header = TRUE, sep= "|", na.strings = "NA", fill = TRUE) ##for a read.table, it expects an element in each column, therefore it is crucial to fill = TRUE in order to fill it in, otherwise it won't be read.
head(pm0, n= 20)
library(dplyr)
pm0 <- pm0[pm0$year %in% c("2000","2019"), ] ##subsetting the data.frame to include two years only. careful, year is denoted as an integer here. strptime not working well for a conversion.
#names(pm0) <- names(pm0) %>% make.names() ##renames column names due to abundance of spaces. Fills them in with "_".
head(pm0, n=20)
##taking summary view of sample values.
x0 <- pm0$arithmetic_mean
summary(x0)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.915 7.039 8.833 9.895 12.563 28.220
str(x0)
## num [1:1309] 11.51 6.77 6.77 10.29 7.92 ...
mean(is.na(x0))
## [1] 0
##subsetting the data into two data.frames, and then boxplotting each on same plot.
pm0$year <- as.integer(pm0$year)
x2000 <- subset(pm0, pm0$year == 2000)
x2019 <- subset(pm0, pm0$year == 2019)
boxplot( x2000$arithmetic_mean, x2019$arithmetic_mean, col="blue", pch = 19)
## ______________ Drilling down into the data_______________##
negative <- x2000$arithmetic_mean < 0
str(negative)
## logi [1:565] FALSE FALSE FALSE FALSE FALSE FALSE ...
sum(negative, na.rm = TRUE)
## [1] 0
mean(negative, na.rm = TRUE)
## [1] 0
hist(x2000$arithmetic_mean, xlab = "PM2.5 (micrograms/cubic meter)") ##getting a view of the data
site2000 <- unique(subset(x2000, state_code == 36, c(county_code, si_id))) ##returns two column data.frame unique to the state, and showing county and site id's.
head(site2000)
site2000 <- paste(site2000[,1], site2000[,2], sep = ".") ##pasting together string/number from col 1 to col 2, returning a "number.number" that is the combination of the two.
negative2 <- x2019$arithmetic_mean < 0
str(negative)
## logi [1:565] FALSE FALSE FALSE FALSE FALSE FALSE ...
sum(negative, na.rm = TRUE)
## [1] 0
mean(negative, na.rm = TRUE)
## [1] 0
hist(x2019$arithmetic_mean, xlab = "PM2.5 (micrograms/cubic meter)")
site2019 <- unique(subset(x2019, state_code == 36, c(county_code, si_id))) ##returns two column data.frame unique to the state, and showing county and site id's.
head(site2019)
site2019 <- paste(site2019[,1], site2019[,2], sep = ".")
both <- intersect(site2019, site2000) ##here- looks at the county, and site monitors that were present in both 2000 and 2019, and the return shows that there were 7.
str(both)
## chr [1:7] "085.10983" "067.10901" "005.10494" "031.10632" "001.10461" ...
x2000$county_site <- with(x2000, paste(county_code, si_id, sep = ".")) ##creating new columns that are the combo of county and site (from above)
x2019$county_site <- with(x2019, paste(county_code, si_id, sep = "."))
ct2000 <- subset(x2000, state_code == 36 & county_site %in% both)
ct2019 <- subset(x2019, state_code == 36 & county_site %in% both)
sapply(split(ct2000, ct2000$county_site), nrow) ## if these were not annual averages, but a collection of dates and n number of observations, the sapply on the split would show us how many observations we had per station in "both".
## 001.10461 001.10468 005.10494 031.10632 061.10801 067.10901 085.10983
## 1 1 2 1 1 1 1
sapply(split(ct2019, ct2019$county_site), nrow)
## 001.10461 001.10468 005.10494 031.10632 061.10801 067.10901 085.10983
## 1 1 1 1 1 2 1
key <- readRDS("Source_Classification_Code.rds")
data0 <- readRDS("summarySCC_PM25.rds")
head(key, n=20)
head(data0, n=20)
str(key)
## 'data.frame': 11717 obs. of 15 variables:
## $ SCC : Factor w/ 11717 levels "10100101","10100102",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Data.Category : Factor w/ 6 levels "Biogenic","Event",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ Short.Name : Factor w/ 11238 levels "","2,4-D Salts and Esters Prod /Process Vents, 2,4-D Recovery: Filtration",..: 3283 3284 3293 3291 3290 3294 3295 3296 3292 3289 ...
## $ EI.Sector : Factor w/ 59 levels "Agriculture - Crops & Livestock Dust",..: 18 18 18 18 18 18 18 18 18 18 ...
## $ Option.Group : Factor w/ 25 levels "","C/I Kerosene",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Option.Set : Factor w/ 18 levels "","A","B","B1A",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ SCC.Level.One : Factor w/ 17 levels "Brick Kilns",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ SCC.Level.Two : Factor w/ 146 levels "","Agricultural Chemicals Production",..: 32 32 32 32 32 32 32 32 32 32 ...
## $ SCC.Level.Three : Factor w/ 1061 levels "","100% Biosolids (e.g., sewage sludge, manure, mixtures of these matls)",..: 88 88 156 156 156 156 156 156 156 156 ...
## $ SCC.Level.Four : Factor w/ 6084 levels "","(NH4)2 SO4 Acid Bath System and Evaporator",..: 4455 5583 4466 4458 1341 5246 5584 5983 4461 776 ...
## $ Map.To : num NA NA NA NA NA NA NA NA NA NA ...
## $ Last.Inventory.Year: int NA NA NA NA NA NA NA NA NA NA ...
## $ Created_Date : Factor w/ 57 levels "","1/27/2000 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Revised_Date : Factor w/ 44 levels "","1/27/2000 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Usage.Notes : Factor w/ 21 levels ""," ","includes bleaching towers, washer hoods, filtrate tanks, vacuum pump exhausts",..: 1 1 1 1 1 1 1 1 1 1 ...
str(data0)
## 'data.frame': 6497651 obs. of 6 variables:
## $ fips : chr "09001" "09001" "09001" "09001" ...
## $ SCC : chr "10100401" "10100404" "10100501" "10200401" ...
## $ Pollutant: chr "PM25-PRI" "PM25-PRI" "PM25-PRI" "PM25-PRI" ...
## $ Emissions: num 15.714 234.178 0.128 2.036 0.388 ...
## $ type : chr "POINT" "POINT" "POINT" "POINT" ...
## $ year : int 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 ...
summary(data0)
## fips SCC Pollutant Emissions
## Length:6497651 Length:6497651 Length:6497651 Min. : 0.0
## Class :character Class :character Class :character 1st Qu.: 0.0
## Mode :character Mode :character Mode :character Median : 0.0
## Mean : 3.4
## 3rd Qu.: 0.1
## Max. :646952.0
## type year
## Length:6497651 Min. :1999
## Class :character 1st Qu.:2002
## Mode :character Median :2005
## Mean :2004
## 3rd Qu.:2008
## Max. :2008
library(plotrix) ##allows us to have a break in the
#with(data0, gap.plot(year, Emissions, gap=c(10,100), col= year, pch = 1, ylim = c(-1, 200), xlab = "Year", ylab = "Emissions", main ="Emissions over time- USA"))
#legend("topright", pch = 1, col = unique(data0$year), legend = unique(data0$year))
#model <- lm(Emissions ~ year, data0) ##creating a liner model between two variables
#abline(model, col = "purple", lty = 2)
#scipen = 5 ##for turning off scientific notation.
#dev.copy(png, file = "plot1.png")
#dev.off()
data1 <- subset(data0, fips == "24510")
with(data1, gap.plot(year, Emissions, gap=c(10,100), col= year, pch = 1, ylim = c(-1, 200), xlab = "Year", ylab = "Emissions", main ="Emissions over time- Baltimore City"))
## Warning in gap.plot(year, Emissions, gap = c(10, 100), col = year, pch = 1, :
## some values of y may not be displayed
legend("topright", pch = 1, col = unique(data1$year), legend = unique(data1$year))
model <- lm(Emissions ~ year, data1) ##creating a liner model between two variables
abline(model, col = "purple", lty = 2)
scipen = 5 ##for turning off scientific notation.
dev.copy(png, file = "plot2.png")
## quartz_off_screen
## 3
dev.off()
## quartz_off_screen
## 2
###________________________
library(ggplot2)
data2 <- subset(data0, fips == "24510" & type == "POINT")
data3 <- subset(data0, fips == "24510" & type == "NONPOINT")
data4 <- subset(data0, fips == "24510" & type == "ON-ROAD")
data5 <- subset(data0, fips == "24510" & type == "NON-ROAD")
library(RColorBrewer)
cols <- brewer.pal(6, "RdYlGn") ##brewer.pal the only useful argument in this package. colorbrewer.org has the list of available color shchema.
pal <-colorRampPalette(cols) ##rendering the colors into Ramp Pallete
g1 <- ggplot(data2, aes(year, Emissions))
p1 <- g1 + geom_point(aes(color = year), size = 4, alpha= 1/2) + ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that.
theme_classic(base_family = "Times", base_size = 9) + ##changes the base font for the entire plot
scale_fill_brewer(palette = "RdYlBu") + ##attempt to use brewer color schemes... doesn't seem to work.
geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm") +
xlab("Year") + ##labels the x axis
ylab("Emissions") +
ggtitle("Baltimore City Emissions by 'POINT'") +
ylim(-1, 30)
g2 <- ggplot(data3, aes(year, Emissions))
p2 <- g2 + geom_point(aes(color = year), size = 4, alpha= 1/2) + ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that.
theme_classic(base_family = "Times", base_size = 9) + ##changes the base font for the entire plot
scale_fill_brewer(palette = "RdYlBu") + ##attempt to use brewer color schemes... doesn't seem to work.
geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm") +
xlab("Year") + ##labels the x axis
ylab("Emissions") +
ggtitle("Baltimore City Emissions by 'NONPOINT'") +
ylim(-1, 30)
g3 <- ggplot(data4, aes(year, Emissions))
p3 <- g3 + geom_point(aes(color = year), size = 4, alpha= 1/2) + ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that.
theme_classic(base_family = "Times", base_size = 9) + ##changes the base font for the entire plot
scale_fill_brewer(palette = "RdYlBu") + ##attempt to use brewer color schemes... doesn't seem to work.
geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm") +
xlab("Year") + ##labels the x axis
ylab("Emissions") +
ggtitle("Baltimore City Emissions by 'ONROAD'") +
ylim(-1, 30)
g4 <- ggplot(data5, aes(factor(year), Emissions))
p4 <- g4 + geom_point(aes(color = year), size = 4, alpha= 1/2) + ###assigning a color to a variable requires aes(color = column name), and the alpha is the transparency, size is just that.
theme_classic(base_family = "Times", base_size = 9) + ##changes the base font for the entire plot
scale_fill_brewer(palette = "RdYlBu") + ##attempt to use brewer color schemes... doesn't seem to work.
geom_smooth(size = 1.5, linetype = 6, color = "orange", method = "lm", na.rm= TRUE) +
xlab("Year") + ##labels the x axis
ylab("Emissions") +
ggtitle("Baltimore City Emissions by 'NONROAD'") +
ylim(-1, 30)
library(cowplot)
plot_grid(p1, p2, p3, p4, labels= "", ncol=2, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 19 rows containing non-finite values (stat_smooth).
## Warning: Removed 19 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 35 rows containing non-finite values (stat_smooth).
## Warning: Removed 35 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 6 rows containing missing values (geom_point).
dev.copy(png, file = "plot3.png")
## quartz_off_screen
## 3
dev.off()
## quartz_off_screen
## 2
#####___________ Alternative ggplot
data1 <- subset(data0, fips == "24510") ##subset for baltimore
agdata1 <- aggregate(Emissions ~ year, data1, sum) ##not taht important for this plot.
gg1 <- ggplot(data1, aes(factor(year), Emissions, fill=type)) ##type is a categorical variable/column header.
gp1 <- gg1 + geom_bar(stat="identity") + ##identity leave the data unchanged. The default is to count
theme_bw() +
guides(fill = "none") + ##gets rid of the legend on the right side.
facet_grid(.~type, scales = "free", space = "free")+ ##grid is organized by "." or everything, by type(non-road, etc...). Separates 4, 4-olored columns into four grids, each with 4 colors.
xlab("Year") + ##labels the x axis
ylab("Emissions")+
ggtitle("PM2.5 Emissions, Baltimore City")
gp1
library(plotrix) ##allows us to have a break in the
library(data.table)
library(dplyr)
data6 <- left_join(data0, key, by = "SCC")
data7 <- data6[data6$SCC.Level.Three %like% "coal", ]
with(data7, gap.plot(year, Emissions, gap=c(100,200), col= year, pch = 1, ylim = c(-1, 300), xlab = "Year", ylab = "Emissions", main ="Coal Use over time- USA"))
## Warning in gap.plot(year, Emissions, gap = c(100, 200), col = year, pch = 1, :
## some values of y may not be displayed
legend("topright", pch = 1, col = unique(data6$year), legend = unique(data6$year))
model <- lm(Emissions ~ year, data7) ##creating a liner model between two variables
abline(model, col = "purple", lty = 2)
scipen = 5 ##for turning off scientific notation.
dev.copy(png, file = "plot4.png")
## quartz_off_screen
## 3
dev.off()
## quartz_off_screen
## 2
library(plotrix) ##allows us to have a break in the
library(data.table)
library(dplyr)
par(mfrow = c(1,1))
data6 <- left_join(data0, key, by = "SCC")
data8 <- subset(data6, fips == "24510" & SCC.Level.One == "Internal Combustion Engines")
with(data8, plot(year, Emissions, col= year, pch = 1, cex = 3, xlab = "Year", ylab = "Emissions", main ="Combusion Engine Part. over time- Maryland City")) ##cex changes the size of the points.
legend("topright", pch = 1, col = unique(data8$year), legend = unique(data8$year))
model <- lm(Emissions ~ year, data8) ##creating a liner model between two variables
abline(model, col = "purple", lty = 2)
scipen = 5 ##for turning off scientific notation.
dev.copy(png, file = "plot5.png")
## quartz_off_screen
## 3
dev.off()
## quartz_off_screen
## 2
par(mfrow = c(2,1))
data6 <- left_join(data0, key, by = "SCC")
data9 <- subset(data6, fips == ("24510") & SCC.Level.One == "Internal Combustion Engines")
with(data9, plot(year, Emissions, col= fips, pch = 1, xlab = "Year", ylab = "Emissions", main ="Combusion Engine Part. Maryland City"))
legend("topright", pch = 1, col = unique(data9$fips), legend = unique(data9$fips))
model <- lm(Emissions ~ year, data9) ##creating a liner model between two variables
abline(model, col = "purple", lty = 2)
scipen = 5
data10 <- subset(data6, fips == ("06037") & SCC.Level.One == "Internal Combustion Engines")
with(data10, plot(year, Emissions, col= fips, pch = 1, xlab = "Year", ylab = "Emissions", main ="Combusion Engine Part. LA County"))
legend("topright", pch = 1, col = unique(data10$fips), legend = unique(data10$fips))
model <- lm(Emissions ~ year, data10)
abline(model, col = "orange", lty = 2)
scipen = 5 ##for turning off scientific notation.
dev.copy(png, file = "plot6.png")
## quartz_off_screen
## 3
dev.off()
## quartz_off_screen
## 2
Reading in the data.frame
data0 <- na.omit(read.csv("./data/activity.csv")) ##reading in data
data0$date <- as.Date(data0$date)
str(data0)
## 'data.frame': 15264 obs. of 3 variables:
## $ steps : int 0 0 0 0 0 0 0 0 0 0 ...
## $ date : Date, format: "2012-10-02" "2012-10-02" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## - attr(*, "na.action")= 'omit' Named int [1:2304] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:2304] "1" "2" "3" "4" ...
head(data0, n=20)
Total Steps per day
s <- split(data0, data0$date)
sapply(s, function(x) colSums(x[, c("steps", "interval")], na.rm = TRUE)) ##calculates total steps for eacy day.
## 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06 2012-10-07
## steps 126 11352 12116 13294 15420 11015
## interval 339120 339120 339120 339120 339120 339120
## 2012-10-09 2012-10-10 2012-10-11 2012-10-12 2012-10-13 2012-10-14
## steps 12811 9900 10304 17382 12426 15098
## interval 339120 339120 339120 339120 339120 339120
## 2012-10-15 2012-10-16 2012-10-17 2012-10-18 2012-10-19 2012-10-20
## steps 10139 15084 13452 10056 11829 10395
## interval 339120 339120 339120 339120 339120 339120
## 2012-10-21 2012-10-22 2012-10-23 2012-10-24 2012-10-25 2012-10-26
## steps 8821 13460 8918 8355 2492 6778
## interval 339120 339120 339120 339120 339120 339120
## 2012-10-27 2012-10-28 2012-10-29 2012-10-30 2012-10-31 2012-11-02
## steps 10119 11458 5018 9819 15414 10600
## interval 339120 339120 339120 339120 339120 339120
## 2012-11-03 2012-11-05 2012-11-06 2012-11-07 2012-11-08 2012-11-11
## steps 10571 10439 8334 12883 3219 12608
## interval 339120 339120 339120 339120 339120 339120
## 2012-11-12 2012-11-13 2012-11-15 2012-11-16 2012-11-17 2012-11-18
## steps 10765 7336 41 5441 14339 15110
## interval 339120 339120 339120 339120 339120 339120
## 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23 2012-11-24
## steps 8841 4472 12787 20427 21194 14478
## interval 339120 339120 339120 339120 339120 339120
## 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## steps 11834 11162 13646 10183 7047
## interval 339120 339120 339120 339120 339120
##you can also use aggregate
data1 <- aggregate(data0$steps, by = list(Steps.Date = data0$date), FUN ="sum")
data1 <- as.data.frame(data1)
colnames(data1) <- c("Date", "Sum_Steps")
head(data1)
hist(data1$Sum_Steps, col= "orange", breaks=36, xlab="No. of Steps", main= "Sum of Steps per Day")
mean_step <- mean(data1$Sum_Steps)
mean_step
## [1] 10766.19
median_step <- median(data1$Sum_Steps)
median_step
## [1] 10765
Plotting Average Steps
data2 <- aggregate(data0$steps, by = list(Interval = data0$interval), FUN ="mean")
colnames(data2) <- c("Interval", "Average_Steps")
head(data2)
with(data2, plot(Interval, Average_Steps, col = "purple",
main= "Average Daily Activity",
xlab= "5min Intervals",
ylab="Average no. Steps",
type= "l"))
max <- which.max(data2$Average_Steps)
max2 <- data2[max,1]
max2
## [1] 835
Working with NA’s
dataNA <- read.csv("./data/activity.csv")
length(which(is.na(dataNA))) ##number of NA's
## [1] 2304
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
dataFill <- dataNA
dataFill$steps <- impute(dataNA$steps, fun=mean)
data3 <- aggregate(dataFill$steps, by = list(Date = dataFill$date), FUN ="sum")
data3 <- as.data.frame(data3)
colnames(data3) <- c("Date", "Sum_Steps")
head(data3)
hist(data3$Sum_Steps, col= "orange", breaks=36, xlab="No. of Steps", main= "Sum of Steps per Day")
mean_step <- mean(data3$Sum_Steps)
mean_step
## [1] 10766.19
median_step <- median(data3$Sum_Steps)
median_step
## [1] 10766.19
Weekday and Weekend
dataFill$date <- as.Date(dataFill$date)
dataFill$weekday <- weekdays(dataFill$date)
dataFill$weekend <- ifelse(dataFill$weekday=="Saturday" | dataFill$weekday=="Sunday", "Weekend", "Weekday" )
head(dataFill)
library(ggplot2)
data4 <- aggregate(dataFill$steps , by= list(dataFill$weekend, dataFill$interval), FUN = "mean") ##aggregating by interval, and weekend code.
data4 <- as.data.frame(data4)
colnames(data4) <- c("Weekend", "Interval", "Mean_Steps")
ggplot(data4, aes(x=Interval, y=Mean_Steps, color=Weekend)) +
geom_line()+
facet_grid(Weekend ~.) +
xlab("Interval") +
ylab("Mean Steps") +
ggtitle(" Average Number of Steps in Each Interval by Weekday/Weekend")
storm <- read.csv("./data/StormData.csv") ##reading in the data
str(storm)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
summary(storm$FATALITIES) ##identify the max/min number of fatalities 583 is max
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0168 0.0000 583.0000
summary(storm$INJURIES) ##1700 max injuries
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1557 0.0000 1700.0000
summary(storm$PROPDMG)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 12.06 0.50 5000.00
summary(storm$CROPDMG)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.527 0.000 990.000
##Taking subset
storm1 <- subset(storm, FATALITIES > 100 | INJURIES > 500)
storm2 <- storm1[, c("EVTYPE", "INJURIES", "FATALITIES") ]
storm3 <- subset(storm, PROPDMG > 4500 | CROPDMG > 900)
storm4 <- storm3[, c("EVTYPE", "PROPDMG", "CROPDMG")]
storm2
library(ggplot2)
plot1 <- ggplot(storm1, aes(EVTYPE, INJURIES, fill=EVTYPE))
inj <- plot1 + geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust=1)) +
xlab("Event Type")+
ylab("Total Injuries")+
ggtitle("Subset of Major Injuries by Event")
inj ##shows a bar graph, where tornado has the highest rate of injuries.
library(ggplot2)
plot2 <- ggplot(storm1, aes(EVTYPE, FATALITIES, fill=EVTYPE))
fat <- plot2 + geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust=1)) +
xlab("Event Type")+
ylab("Total Fatalities")+
ggtitle("Subset of Fatalities by Event")
fat ##Tornado also has the highest rates of fatalities..
library(ggplot2)
plot3 <- ggplot(storm3, aes(EVTYPE, CROPDMG, fill=EVTYPE))
cdmg <- plot3 + geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust=1)) +
xlab("Event Type")+
ylab("Damage")+
guides(fill = "none")+
ggtitle("Impact on Crops")
##shows a bar graph, where tornado has the highest rate of injuries.
library(ggplot2)
plot4 <- ggplot(storm3, aes(EVTYPE, PROPDMG, fill=EVTYPE))
pdmg <- plot4 + geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust=1)) +
xlab("Event Type")+
ylab("Damage")+
guides(fill = "none")+
ggtitle("Impact on Property")
library(cowplot)
title <- ggdraw() + draw_label("Economic Impacts of Major Storm Events", fontface='bold')
plotmain <- plot_grid(cdmg, pdmg, ncol=2, labels="AUTO")
finalplot <- plot_grid(title, plotmain, nrow=2, rel_heights = c(.2, 1, 1))
finalplot